# Code updated: 2021 April 27

#### DESCRIPTION ####
#
# Generates and saves (as .pdf) figures:
# (1)   Weekly time-series of cumulative household fraction with any irrigation violation notices
# (2)   Regression discontinuity first stage for violation notices
# (3)   Regression discontinuity reduced-form for post-treatment total weekly consumption during all hours
# (4A)  Regression discontinuity reduced-form for post-treatment total consumption during irrigation *non-allowed* hours
# (4B)  Regression discontinuity reduced-form for post-treatment total consumption during irrigation *allowed* hours
# (5)   Monthly time-series of household average weekly consumption during all three regimes, by RCT arm
# (A4)  Distributions of pre-treatment consumption by WaterSmart arm
# (A5)  Regression discontinuity distribution check for bunching (McCrary test)
# (A6)  Regression discontinuity check for pre-treatment consumption
#
# Generated elsewhere: (A1) Palmer Drought Index, (A2) sample violation notice, (A3) sample WaterSmart report
#
# Notes:
# 1. All figures use only 'insample' households (insample == 1):
#    1. Burbank household in WaterSmart 'experimental' or 'control' arms (b1, b0)
#    2. Nonmissing pretreatment consumption: first observed no later than 2014-04-01
#    3. Nonmissing posttreatment consumption: is observed during 2015-05-01 through 2015-10-31
#    4. Nonmissing running variable for irrigation violation warnings 
#

#### PREAMBLE ####
rm(list = ls()) # Clear workspace
wd = "~/Dropbox/Shared/Watersmart/data" ; setwd(wd) # * SET THIS * Directory containing compiled WaterSmart data
figuresdir = "~/Dropbox/Overleaf/WFPR_Watersmart/figures" # * SET THIS * Directory in which to save generated figures
pl = c('data.table', 'fasttime', 'zoo', 'ggplot2', 'scales') ; lapply(pl, require, character.only = T) # R packages to load

#### READ IN DATA ####

# 1. Cross section of Burbank households
# 2. Panel of household daily consumption measures
# 3. Panel including all irrigation violation warnings

cs <- fread(file.path(wd, 'Analysis_Burbank_Cross_Section.csv'))
dcon <- fread(file.path(wd, 'Analysis_Burbank_Daily_Consumption.csv'))
viols <- fread(file.path(wd, 'Analysis_Burbank_Violations.csv'))
setkey(cs, residence_id) ; setkey(dcon, residence_id, readdate) ; setkey(viols, residence_id, violdate)

# Keep only 'insample' households (see definition in above note)
cs <- cs[insample == 1] ; dcon <- dcon[insample == 1] ; viols <- viols[insample == 1]

# Format dates
cs[, wsdate := as.Date(wsdate, format = '%Y-%m-%d')][, changeoccdate := as.Date(changeoccdate, format = '%Y-%m-%d')]
dcon[, readdate := as.Date(readdate, format = '%Y-%m-%d')]
viols[, violdate := as.Date(violdate, format = '%Y-%m-%d')]

# Generate week-of-sample indicator for consumption
dcon[, readwk := as.integer(floor(difftime(readdate, '2015-06-30', units = 'days')/7))]
dcon[, weekdate := min(readdate), by = readwk]

# Compute average weekly consumption by household: April 1st 2014 through December 31st 2015
dcon[, ts96 := 0L] ; dcon[wday(readdate) %in% c(3, 7), ts96 := b9gal + a6gal]
wcon <- dcon[weekdate >= as.Date('2014-04-01') & weekdate <= as.Date('2015-12-31'), 
             .(wgal = sum(dailygal), ts96 = sum(ts96)), by = .(residence_id, arm, weekdate)]
wcon[, nonts96 := wgal - ts96]
setkey(wcon, residence_id) ; setkey(cs, residence_id)
wcon <- merge(wcon, cs[, .(residence_id, autoviol, runningv)])

#### (1) WEEKLY TIME-SERIES OF CUMULATIVE HOUSEHOLD FRACTION WITH ANY IRRIGATION VIOLATION NOTICES ####

# Determine date of earliest observed violation notice for each household (if any), keeping only through 2015-12-31
viol1 <- viols[violdate <= as.Date('2015-12-31'), .(violdate = min(violdate)), by = .(residence_id)]

# Collapse to count of first violations notices per date
viol1cts <- viol1[, .(viol1s = .N), by = violdate]
setkey(viol1cts, violdate)

# Merge to balanced daily panel that has crosswalk to weekdate values
dailypan <- unique(dcon[readdate <= as.Date('2015-12-31'), .(violdate = readdate, weekdate)], by = NULL)
setkey(dailypan, violdate)
viol1pan <- merge(viol1cts, dailypan, all = T)

# Determine *cumulative* violations notices as of each week during 2015
viol1pan[is.na(viol1s), viol1s := 0]
viol1pan <- viol1pan[, .(viol1s = sum(viol1s)), by = weekdate]
setkey(viol1pan, weekdate)
viol1pan[, cumviols := cumsum(viol1s)]
viol1pan <- viol1pan[year(weekdate) == 2015 | weekdate == '2014-12-31', .(weekdate, cumviols)]

# Convert violation counts into fraction of residence_id that were cumulatively notified
viol1pan[, frac_notice := cumviols / length(wcon[weekdate == as.Date('2014-12-31')]$residence_id)]
viol1pan[, frac_new_notice := frac_notice - shift(frac_notice)]
viol1pan[weekdate == '2014-12-31', frac_new_notice := 0]

# Create and save time series graph: cumulative
ggplot(viol1pan, aes(x = weekdate, y = frac_notice * 100)) + geom_line() + geom_point() + theme_bw() +
  scale_x_date(name = 'Week of 2015', breaks = c(as.Date('2015-01-01'), as.Date('2015-04-01'), as.Date('2015-07-01'), 
                                                 as.Date('2015-10-01'), as.Date('2016-01-01')), date_labels = '%B') +
  scale_y_continuous(name = 'Cumulative share of households notified (percent)', limits = c(0, 45), breaks = c(0, 15, 30, 45)) +
  annotate('text', label = 'Automated algorithmic\ndetection of violations', x = as.Date('2015-04-20'), y = 40.5, color = 'firebrick') +
  geom_segment(aes(x = as.Date('2015-06-05'), y = 39.4, xend = as.Date('2015-06-25'), yend = 39.4), 
               arrow = arrow(length = unit(0.4, 'cm')), color = 'firebrick') +
  theme(text = element_text(size = 12))

ggsave(file.path(figuresdir, 'TimeSeries_Notices.pdf'), width = 8, height = 6, units = 'in', dpi = 200)

rm(viol1, viol1cts, dailypan, viol1pan)

#### (2) REGRESSION DISCONTINUITY FIRST STAGE FOR SENT AUTOMATED IRRIGATION VIOLATION WARNING ####

# Using a bandwidth of 80 and a binsize of 10 gallons
rd1s <- cs[abs(runningv) <= 80, .(runningv, autoviol)]
rd1s[runningv < 0, binrunningv := floor(runningv / 10) * 10 + 5.5]
rd1s[runningv > 0, binrunningv := ceiling(runningv / 10) * 10 - 5.5]

# Determine nonparametric predicted fit values using loess smoother
rd1s.below <- rd1s[abs(runningv) <= 80 & runningv < 0, .(runningv, autoviol)]
rd1s.above <- rd1s[abs(runningv) <= 80 & runningv > 0, .(runningv, autoviol)]
rd1s.below[, autoviol_pred := predict(loess(autoviol ~ runningv))]
rd1s.above[, autoviol_pred := predict(loess(autoviol ~ runningv))]

gg1s <- rd1s[abs(runningv) <= 80, .(ct = .N, autoviol = mean(100 * autoviol)), by = .(binrunningv)]
setkey(gg1s, binrunningv)

ggplot() + geom_point(data = gg1s, aes(x = binrunningv, y = autoviol, size = ct), show.legend = F, shape = 1) +
  geom_vline(xintercept = 0, color = 'firebrick') + theme_bw() + 
  scale_y_continuous(name = 'Share of households sent violation notice (percent)', limits = c(0, 100)) + 
  scale_x_continuous(name = 'Distance along running variable to irrigation violation cutoff (gallons)',
                     breaks = c(-80, -40, 0, 40, 80)) +
  theme(text = element_text(size = 12)) +
  geom_line(data = rd1s.below, aes(x = runningv, y = 100 * autoviol_pred)) +
  geom_line(data = rd1s.above, aes(x = runningv, y = 100 * autoviol_pred))

ggsave(file.path(figuresdir, 'RD_FirstStage_Notices.pdf'), width = 8, height = 6, units = 'in', dpi = 200)

rm(gg1s, rd1s, rd1s.above, rd1s.below)

#### (3, 4A, 4B) REGRESSION DISCONTINUITY REDUCED-FORM FOR POST-TREATMENT CONSUMPTION DURING VARIOUS HOUR GROUPINGS ####

# Regression discontinuity reduced-form for post-treatment total weekly consumption during all hours
# Regression discontinuity reduced-form for post-treatment total consumption during irrigation *non-allowed* hours
# Regression discontinuity reduced-form for post-treatment total consumption during irrigation *allowed* hours

# Compute post-treatment RD outcomes: July-October 2015, pooled average weekly outcomes

rdrf <- wcon[year(weekdate) == 2015 & month(weekdate) %in% 7:10, .(wgal = mean(wgal), nonts96 = mean(nonts96), ts96 = mean(ts96)),
             by = .(residence_id, runningv)]

# Subset households along running variable, using a bandwidth of 80 and a binsize of 10 gallons
rdrf <- rdrf[abs(runningv) <= 80]
rdrf[runningv < 0, binrunningv := floor(runningv / 10) * 10 + 5.5]
rdrf[runningv > 0, binrunningv := ceiling(runningv / 10) * 10 - 5.5]

# Determine nonparametric predicted fit values using loess smoother
rdrf.below <- rdrf[runningv < 0, .(runningv, wgal, nonts96, ts96)]
rdrf.above <- rdrf[runningv > 0, .(runningv, wgal, nonts96, ts96)]
rdrf.below[, wgal_pred := predict(loess(wgal ~ runningv))]
rdrf.below[, nonts96_pred := predict(loess(nonts96 ~ runningv))]
rdrf.below[, ts96_pred := predict(loess(ts96 ~ runningv))]
rdrf.above[, wgal_pred := predict(loess(wgal ~ runningv))]
rdrf.above[, nonts96_pred := predict(loess(nonts96 ~ runningv))]
rdrf.above[, ts96_pred := predict(loess(ts96 ~ runningv))]

# Collapse to local averages
ggrdrf <- rdrf[, .(ct = .N, wgal = mean(wgal), nonts96 = mean(nonts96), ts96 = mean(ts96)), by = binrunningv]
setkey(ggrdrf, binrunningv)

# A) Total weekly consumption during all hours
ggplot() + geom_point(data = ggrdrf, aes(x = binrunningv, y = wgal, size = ct), show.legend = F, shape = 1) +
  geom_vline(xintercept = 0, color = 'firebrick') + theme_bw() + 
  scale_y_continuous(name = 'Average weekly water consumption (gallons)', limits = c(1250, 2250)) + 
  scale_x_continuous(name = 'Distance along running variable to irrigation violation cutoff (gallons)',
                     breaks = c(-80, -40, 0, 40, 80)) +
  theme(text = element_text(size = 12)) +
  geom_line(data = rdrf.below, aes(x = runningv, y = wgal_pred)) +
  geom_line(data = rdrf.above, aes(x = runningv, y = wgal_pred))

ggsave(file.path(figuresdir, 'RD_ReducedForm_Weekly.pdf'), width = 8, height = 6, units = 'in', dpi = 200)

# B) Total weekly consumption during * irrigation not allowed * hours
ggplot() + geom_point(data = ggrdrf, aes(x = binrunningv, y = nonts96, size = ct), show.legend = F, shape = 1) +
  geom_vline(xintercept = 0, color = 'firebrick') + theme_bw() + 
  scale_y_continuous(name = 'Average weekly water consumption (gallons)', limits = c(750, 1750)) + 
  scale_x_continuous(name = 'Distance along running variable to irrigation violation cutoff (gallons)',
                     breaks = c(-80, -40, 0, 40, 80)) +
  theme(text = element_text(size = 14)) +
  geom_line(data = rdrf.below, aes(x = runningv, y = nonts96_pred)) +
  geom_line(data = rdrf.above, aes(x = runningv, y = nonts96_pred))

ggsave(file.path(figuresdir, 'RD_ReducedForm_NotAllowed.pdf'), width = 8, height = 6, units = 'in', dpi = 200)

# C) Total weekly consumption during * irrigation allowed * hours
ggplot() + geom_point(data = ggrdrf, aes(x = binrunningv, y = ts96, size = ct), show.legend = F, shape = 1) +
  geom_vline(xintercept = 0, color = 'firebrick') + theme_bw() + 
  scale_y_continuous(name = 'Average weekly water consumption (gallons)', limits = c(0, 1000)) + 
  scale_x_continuous(name = 'Distance along running variable to irrigation violation cutoff (gallons)',
                     breaks = c(-80, -40, 0, 40, 80)) +
  theme(text = element_text(size = 14)) +
  geom_line(data = rdrf.below, aes(x = runningv, y = ts96_pred)) +
  geom_line(data = rdrf.above, aes(x = runningv, y = ts96_pred))

ggsave(file.path(figuresdir, 'RD_ReducedForm_Allowed.pdf'), width = 8, height = 6, units = 'in', dpi = 200)

rm(rdrf, rdrf.above, rdrf.below, ggrdrf)

#### (5) MONTHLY AVERAGE WEEKLY CONSUMPTION BY RCT ARM ACROSS ALL THREE 'REGIMES' ####

# Average weekly gallons for each household in each month
wcon[, monthdate := as.Date(paste0(substr(weekdate, 1, 8), '01'))]
mcon <- wcon[, .(wgal = mean(wgal)), by = .(residence_id, arm, monthdate)]

# Average weekly gallons for each RCT arm in each month: 2014-11 through 2015-10
ggmcon <- mcon[monthdate >= '2014-11-01' & monthdate <= '2015-10-01', .(wgal = mean(wgal)), by = .(monthdate, arm)]
ggmcon[, monthdate := monthdate + 14]

ggplot(ggmcon, aes(x = monthdate, y = wgal, group = arm, color = arm, shape = arm)) + geom_point(size = 2.5) +   
  scale_y_continuous(name = 'Average weekly water consumption (gallons)', limits = c(1500, 3000)) + 
  scale_x_date(name = '', breaks = c(as.Date('2014-11-01'), as.Date('2015-04-01'), as.Date('2015-07-01'), as.Date('2015-11-01')), 
               labels = c('Nov. 2014', 'April 2015', 'July 2015', 'Nov. 2015'), limits = as.Date(c('2014-11-01', '2015-11-01'))) +
  scale_shape_manual(name = 'WaterSmart treatment arm', labels = c('Control', 'Home water reports (HWR)'), values = c(1, 19)) +
  scale_color_manual(name = 'WaterSmart treatment arm', labels = c('Control', 'Home water reports (HWR)'), values = c('firebrick', 'steelblue')) +
  theme_bw() + theme(legend.position = c(0.82, 0.89), legend.background = element_rect(color = 'black', fill = 'white')) +
  theme(text = element_text(size = 12)) + theme(panel.grid.minor = element_blank()) +
  geom_vline(xintercept = as.Date('2015-04-01'), linetype = 'dashed', color = 'grey40') + 
  geom_vline(xintercept = as.Date('2015-07-01'), linetype = 'dashed', color = 'grey40') +
  annotate('text', label = 'Pre-treatments', x = as.Date('2015-01-15'), y = 1550, color = 'grey40', size = 5) +
  annotate('text', label = 'HWR only', x = as.Date('2015-05-15'), y = 1550, color = 'grey40', size = 5) +
  annotate('text', label = 'HWR + auto. violations', x = as.Date('2015-09-01'), y = 1550, color = 'grey40', size = 5)

ggsave(file.path(figuresdir, 'RCT_TimeSeries_Regimes.pdf'), width = 8, height = 6, units = 'in', dpi = 200)

rm(mcon, ggmcon)

#### (A4) DISTRIBUTIONS OF PRE-TREATMENT CONSUMPTION BY RCT ARM ####

# Using full year of pre-treatment consumption spanning 2014-04-01 through 2015-03-31

ptuse <- wcon[weekdate >= as.Date('2014-04-01') & weekdate <= as.Date('2015-03-31'), .(wgal = mean(wgal, na.rm = T)), by = .(residence_id, arm)]
options(scipen = 10000)

ggplot(ptuse, aes(x = wgal, color = arm, linetype = arm, group = arm)) + 
  geom_line(stat = 'density', bw = 500, size = 0.5) +
  theme_bw() + scale_y_continuous(name = 'Density of households') + 
  scale_x_continuous(name = 'Pre-treatment average weekly water consumption (gallons)') +
  scale_linetype_manual(name = 'WaterSmart treatment arm', labels = c('Control', 'Home water reports (HWR)'), values = c('dashed', 'solid')) +
  scale_color_manual(name = 'WaterSmart treatment arm', labels = c('Control', 'Home water reports (HWR)'), values = c('firebrick', 'steelblue')) +
  theme_bw() + theme(legend.position = c(0.82, 0.89), legend.background = element_rect(color = 'black', fill = 'white')) +
  theme(text = element_text(size = 12)) + theme(panel.grid.minor = element_blank())

ggsave(file.path(figuresdir, 'RCT_Pretreatment_Distributions.pdf'), width = 8, height = 6, units = 'in', dpi = 200)

rm(ptuse)

#### (A5) MCCRARY TEST FOR BUNCHING IN THE REGRESSION DISCONTINUITY DESIGN ####

# Using a bandwidth of 80 and a binsize of 1 (i.e. not binning) for this check

ggplot(cs[abs(runningv) <= 80], aes(x = runningv)) + geom_histogram(binwidth = 1) +
  geom_vline(xintercept = 0, color = 'firebrick') + theme_bw() +
  scale_y_continuous(name = 'Number of households', breaks = c(0, 250, 500, 750)) + 
  scale_x_continuous(name = 'Distance along running variable to irrigation violation cutoff (gallons)',
                     breaks = c(-80, -40, 0, 40, 80)) +
  geom_density(aes(x = runningv)) + theme(text = element_text(size = 12))

ggsave(file.path(figuresdir, 'RD_BunchingTest.pdf'), width = 8, height = 6, units = 'in', dpi = 200)

#### (A6) REGRESSION DISCONTINUITY PRE-TREATMENT CONSUMPTION CHECK ####

# As above, using full year of pre-treatment consumption spanning 2014-04-01 through 2015-03-31
rdpre <- wcon[weekdate >= as.Date('2014-04-01') & weekdate <= as.Date('2015-03-31'), .(wgal = mean(wgal, na.rm = T)), by = .(residence_id, runningv)]
rdpre[runningv < 0, binrunningv := floor(runningv / 10) * 10 + 5.5]
rdpre[runningv > 0, binrunningv := ceiling(runningv / 10) * 10 - 5.5]

ggrdpre <- rdpre[abs(runningv) <= 80, .(ct = .N, wgal = mean(wgal)), by = .(binrunningv)]

# Determine nonparametric predicted fit values using loess smoother
rdpre.below <- rdpre[abs(runningv) <= 80 & runningv < 0, .(runningv, wgal)]
rdpre.above <- rdpre[abs(runningv) <= 80 & runningv > 0, .(runningv, wgal)]
rdpre.below[, wgal_pred := predict(loess(wgal ~ runningv))]
rdpre.above[, wgal_pred := predict(loess(wgal ~ runningv))]

ggplot() + geom_point(data = ggrdpre, aes(x = binrunningv, y = wgal, size = ct), show.legend = F, shape = 1) +
  geom_vline(xintercept = 0, color = 'firebrick') + theme_bw() + 
  scale_y_continuous(name = 'Pre-treatment average weekly consumption (gallons)', limits = c(1500, 3000)) + 
  scale_x_continuous(name = 'Distance along running variable to irrigation violation cutoff (gallons)',
                     breaks = c(-80, -40, 0, 40, 80)) +
  theme(text = element_text(size = 12)) +
  geom_line(data = rdpre.below, aes(x = runningv, y = wgal_pred)) +
  geom_line(data = rdpre.above, aes(x = runningv, y = wgal_pred))

ggsave(file.path(figuresdir, 'RD_ReducedForm_Pretreatment.pdf'), width = 8, height = 6, units = 'in', dpi = 200)

rm(ggrdpre, rdpre, rdpre.below, rdpre.above)

## END
